In [9]:
import math
import numpy
In [10]:
# Toy dataset, column-major: one inner list per feature, one position per person.
# Rows: [0] name, [1] sex, [2] age bucket, [3] height bucket (metres).
dataset = [['dugg', 'clare', 'will', 'donald', 'deril', 'gregory', 'julia'],
['M', 'F', 'M', 'M', 'M', 'M', 'F'],
['20-30', '20-30', '20-30', '20-30', '30-40', '20-30', '5-10'],
['1.60-1.70', '1.70-1.80', '1.70-1.80', '1.80-1.90', '1.70-1.80', '>1.90', '<1.60']]
In [11]:
labels = ['no', 'no', 'yes', 'yes', 'no', 'no', 'yes']
In [12]:
def calc_entropy(labels):
    """Shannon entropy (natural log) of a sequence of class labels.

    Each distinct label contributes p * log(1/p), where p is its
    relative frequency. Returns 0.0 for an empty sequence.
    """
    total = len(labels)
    frequencies = (labels.count(label) / float(total) for label in set(labels))
    return sum((p * math.log(1.0 / p) for p in frequencies), 0.0)
In [13]:
def calc_split_entropy(splitted_labels):
    """Weighted average entropy of a partition of labels.

    `splitted_labels` is an iterable of label groups; each group's
    entropy is weighted by the fraction of all labels it contains.
    """
    groups = list(splitted_labels)
    total = sum(len(group) for group in groups)
    weighted_entropy = 0.0
    for group in groups:
        weight = len(group) / float(total)
        weighted_entropy += weight * calc_entropy(group)
    return weighted_entropy
In [14]:
def split_by(feature_index, dataset, labels):
    """Group labels by the value of one feature column.

    Parameters:
        feature_index: index of the feature row in `dataset` to split on.
        dataset: column-major data — dataset[feature_index] lists one
            feature value per example.
        labels: label for each example, positionally aligned with the
            feature values.

    Returns:
        dict mapping each distinct feature value to the list of labels
        of the examples that have that value (insertion order preserved).
    """
    splitted_labels = {}
    for i, v in enumerate(dataset[feature_index]):
        # dict.has_key() was removed in Python 3; setdefault handles
        # both first-seen and already-seen values in one call.
        splitted_labels.setdefault(v, []).append(labels[i])
    return splitted_labels
In [15]:
def calc_variance(values):
    """Fraction of distinct values in `values` — a crude diversity
    measure in (0, 1]; not a statistical variance."""
    distinct_count = len(set(values))
    return distinct_count / float(len(values))
In [25]:
calc_split_entropy(split_by(3, dataset, labels).values())
Out[25]:
In [17]:
calc_entropy(labels)
Out[17]:
In [24]:
calc_variance(dataset[3])
Out[24]:
In [ ]: